# Importing Data Set
# Each call opens an interactive file chooser; pick the "a" file first and
# the "b" file second. The first row of each CSV is used as column names.
ss14pusa <- read.csv(file = file.choose(), header = TRUE)
ss14pusb <- read.csv(file = file.choose(), header = TRUE)
#Libraries
require(plyr)
require(dplyr)
require(rbokeh)
require(ggplot2)
#Importing Data Set in a faster way
library(data.table)
# Columns to keep from the person files. AGEP is part of the list because the
# renamed data set below carries an "Age" column as its twelfth variable.
variables <- c(
  "SEX", "OCCP", "WKHP", "MAR", "WAGP", "SCHL",
  "ESP", "RAC1P", "ST", "PAOC", "PWGTP", "AGEP"
)
ss14pusa <- fread(
  "~/Google Drive/Data for R/csv_pus/ss14pusa.csv",
  header = TRUE, select = variables
)
ss14pusb <- fread(
  "~/Google Drive/Data for R/csv_pus/ss14pusb.csv",
  header = TRUE, select = variables
)
names(ss14pusa)

# Pick the columns by name instead of attach()/detach(). Indexing with
# `variables` also fixes the column order, which matters because the columns
# are renamed by position below.
ss14pusa_edit <- as.data.frame(ss14pusa)[, variables]
ss14pusb_edit <- as.data.frame(ss14pusb)[, variables]

# Stack the two files into a single data set.
Data <- rbind(ss14pusa_edit, ss14pusb_edit)

# Readable names, in the same order as `variables`.
colnames(Data) <- c(
  "Gender", "Occupation", "Work_hours", "Marriage", "Income", "Education",
  "Parental_Occupation", "Race", "State", "Children", "Weight", "Age"
)

# Save the combined, renamed data set (written before any row filtering).
write.csv(Data, file = "Data.csv", row.names = TRUE)
# Delete rows whose income is missing or not greater than zero.
positive_income <- !is.na(Data$Income) & Data$Income > 0
row_to_keep <- which(positive_income)
Data <- Data[row_to_keep, , drop = FALSE]
# Recode Region
# Maps the numeric state code in Data$State to a Census region name.
# https://en.wikipedia.org/wiki/List_of_regions_of_the_United_States#/media/File:Census_Regions_and_Division_of_the_United_States.svg
library(car)
# recode() is called as car::recode because dplyr, which this script also
# loads, exports a recode() with a different interface.
# Code 50 is listed under Northeast so that Region agrees with the Division
# recode below, which places 50 in New England.
Data$Region <- car::recode(
  Data$State,
  paste(
    "c(9,23,25,33,44,50,34,36,42)='Northeast'",
    "c(17,18,26,39,55,19,20,27,29,31,38,46)='Midwest'",
    "c(10,12,13,24,37,45,51,11,54,1,21,28,47,5,22,40,48)='South'",
    "c(4,8,16,30,32,35,49,56,2,6,15,41,53)='West'",
    "'72'='Puerto Rico'",
    sep = ";"
  )
)
# Recode Division
# Same idea at the finer division level. Codes that are not listed (for
# example 72) are left unchanged by recode().
Data$Division <- car::recode(
  Data$State,
  paste(
    "c(53,41,6,2,15)='Pacific'",
    "c(4,8,16,30,32,35,49,56)='Mountain'",
    "c(19,20,27,29,31,38,46)='West North Central'",
    "c(17,18,26,39,55)='East North Central'",
    "c(5,22,40,48)='West South Central'",
    "c(1,21,28,47)='East South Central'",
    "c(34,36,42)='Middle Atlantic'",
    "c(10,11,12,13,24,37,45,51,54)='South Atlantic'",
    "c(9,23,25,33,44,50)='New England'",
    sep = ";"
  )
)
# Recode Marriage
# Labels are attached to explicit codes so that each label stays with its own
# code even when one of the codes does not occur in the data.
# NOTE(review): assumes the marital-status codes are 1-5 in this order, as
# implied by the five labels -- confirm against the data dictionary.
Data$Marriage <- factor(
  Data$Marriage,
  levels = 1:5,
  labels = c("Married", "Widowed", "Divorced", "Seperated", "Never Married")
)
# Recode Education
# Collapses the numeric attainment codes 1-24 into five bands. This is done
# once; car::recode is named explicitly because dplyr also exports a recode().
library(car)
Data$Education <- car::recode(
  Data$Education,
  "1:11='~greade8';12:19='grade9~college_nodegree';20:21='associate/bachelor';22:23='master/professional';24='doctor'"
)
# Income of women by marital status and age, one occupation group at a time.
# For each group: weighted mean income per (Marriage, Age) cell, a ggplot
# scatter with loess curves, an interactive rbokeh scatter, and a linear model.
# NOTE(review): Data_wed_SCI, Data_women and Data_wed_occp are not created
# anywhere in the visible part of this script -- confirm where they come from.
summary(lm(MEAN ~ Marriage + Age, data = Data_wed_SCI))

# MGR
Women_MGR <- filter(Data_women, Occupation == "MGR")
Data_wed_MGR <- ddply(
  Women_MGR, .(Marriage, Age), summarise,
  MEAN = weighted.mean(Income, Weight, na.rm = TRUE)
)
ggplot(Data_wed_MGR, aes(y = MEAN, x = Age, colour = Marriage, shape = Marriage)) +
  geom_point() +
  geom_smooth(method = "loess", fill = NA)
mgr <- figure(width = NULL, height = NULL) %>%
  ly_points(
    Age, MEAN,
    data = Data_wed_MGR, color = Marriage,
    glyph = as.factor(Marriage), hover = list(Marriage, Age)
  )
mgr

# LGL
Women_LGL <- filter(Data_women, Occupation == "LGL")
Data_wed_LGL <- ddply(
  Women_LGL, .(Marriage, Age), summarise,
  MEAN = weighted.mean(Income, Weight, na.rm = TRUE)
)
ggplot(Data_wed_LGL, aes(y = MEAN, x = Age, colour = Marriage, shape = Marriage)) +
  geom_point() +
  geom_smooth(method = "loess", fill = NA)
lgl <- figure(width = NULL, height = NULL) %>%
  ly_points(
    Age, MEAN,
    data = Data_wed_LGL, color = Marriage,
    glyph = as.factor(Marriage), hover = list(Marriage, Age)
  )
lgl
summary(lm(MEAN ~ Marriage + Age, data = Data_wed_LGL))

# BUS
Women_BUS <- filter(Data_women, Occupation == "BUS")
Data_wed_BUS <- ddply(
  Women_BUS, .(Marriage, Age), summarise,
  MEAN = weighted.mean(Income, Weight, na.rm = TRUE)
)
ggplot(Data_wed_BUS, aes(y = MEAN, x = Age, colour = Marriage, shape = Marriage)) +
  geom_point() +
  geom_smooth(method = "loess", fill = NA)
bus <- figure(width = NULL, height = NULL) %>%
  ly_points(
    Age, MEAN,
    data = Data_wed_BUS, color = Marriage, hover = list(Marriage, Age)
  )
bus
summary(lm(MEAN ~ Marriage + Age, data = Data_wed_BUS))

# CMM
Women_CMM <- filter(Data_women, Occupation == "CMM")
Data_wed_CMM <- ddply(
  Women_CMM, .(Marriage, Age), summarise,
  MEAN = weighted.mean(Income, Weight, na.rm = TRUE)
)
ggplot(Data_wed_CMM, aes(y = MEAN, x = Age, colour = Marriage, shape = Marriage)) +
  geom_point() +
  geom_smooth(method = "loess", fill = NA)
cmm <- figure(width = NULL, height = NULL) %>%
  ly_points(
    Age, MEAN,
    data = Data_wed_CMM, color = Marriage, hover = list(Marriage, Age)
  )
cmm
summary(lm(MEAN ~ Marriage + Age, data = Data_wed_CMM))

# All occupations together.
bounds <- range(c(Data_wed_occp$Marriage, Data_wed_occp$MEAN))

# One rbokeh panel per occupation. Each panel draws only its own subset `d`;
# passing the full data set here would make every panel identical.
grid_plot(
  lapply(
    split(Data_wed_occp, as.factor(Data_wed_occp$Occupation)),
    function(d) {
      figure(width = 300, height = 350) %>%
        ly_points(
          Age, MEAN,
          data = d, color = Marriage, hover = list(Marriage, Age)
        )
    }
  ),
  nrow = 2, same_axes = TRUE
)

# Stacked bars of mean income per occupation, split and labelled by marital
# status.
ggplot(Data_wed_occp, aes(x = factor(Occupation), fill = factor(Marriage), y = MEAN)) +
  geom_bar(stat = "identity") +
  geom_text(
    position = "stack",
    aes(y = MEAN, label = Marriage), size = 2, check_overlap = TRUE
  )

library(plotly)
p <- plot_ly(
  Data_wed_occp,
  x = Age, y = MEAN, group = Occupation,
  text = paste("Clarity:", Age), mode = "markers",
  xaxis = paste0("x", Occupation), symbol = Marriage, jitter = .9
)
p <- subplot(p)
# Average income of women by presence/age of children and weekly work hours.
# NOTE(review): Data_women is not created in the visible part of this script
# -- confirm where it is defined.
# The dplyr verbs are namespace-qualified so the result does not depend on the
# order in which plyr and dplyr were attached (plyr also exports summarize(),
# and that version ignores the grouping).
MotherWorking <- dplyr::select(Data_women, Income, Children, Work_hours)
#detach(package:plyr)
GroupedMotherWorking <- MotherWorking %>%
  na.omit() %>%
  dplyr::group_by(Children, Work_hours) %>%
  dplyr::summarize(
    AvgIncome = mean(Income),
    count = n()
  )
library(plotly)
# note how size is automatically scaled and added as hover text
plot_ly(
  GroupedMotherWorking,
  x = Work_hours, y = AvgIncome,
  size = sqrt(count), color = Children,
  text = paste("Count: ", count),
  opacity = 1 - Children * 0.2,
  mode = "markers"
)
#plot_ly(GroupedMotherWorking, x = Work_hours, y = AvgIncome, text = paste("Count: ", #count),mode = "markers", opacity=1-Children*0.2,color = Children)
# Observations:
# 1. When Work_hours is below 60 hours, average income tends to be positively
#    related to work hours.
# 2. There are generally more females with no children.
# 3. Females with children aged 6 to 17 often have a higher average income.